import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale,StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")
# get working directory
from pathlib import Path
print(Path.cwd())
C:\Users\dpapa\ANA 680_material Nov '22\WK4 FINAL
# or another way...
import os
cwd = os.getcwd()
print(cwd)
C:\Users\dpapa\ANA 680_material Nov '22\WK4 FINAL
# change working dir
os.chdir("C:\\Users\\dpapa")
cwd = os.getcwd()
print(cwd)
C:\Users\dpapa
# Load the dataset. Cells with content “NULL”, “?”, or “NONE” or just blank space will be interpreted as np.NAN
df=pd.read_csv(r"C:\Users\dpapa\water_quality_dataset.csv", na_values=['NULL', '?', 'NONE'])
df
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | 0 |
| 1 | 3.716080 | 129.422921 | 18630.057858 | 6.635246 | NaN | 592.885359 | 15.180013 | 56.329076 | 4.500656 | 0 |
| 2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | NaN | 418.606213 | 16.868637 | 66.420093 | 3.055934 | 0 |
| 3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 100.341674 | 4.628771 | 0 |
| 4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3271 | 4.668102 | 193.681735 | 47580.991603 | 7.166639 | 359.948574 | 526.424171 | 13.894419 | 66.687695 | 4.435821 | 1 |
| 3272 | 7.808856 | 193.553212 | 17329.802160 | 8.061362 | NaN | 392.449580 | 19.903225 | NaN | 2.798243 | 1 |
| 3273 | 9.419510 | 175.762646 | 33155.578218 | 7.350233 | NaN | 432.044783 | 11.039070 | 69.845400 | 3.298875 | 1 |
| 3274 | 5.126763 | 230.603758 | 11983.869376 | 6.303357 | NaN | 402.883113 | 11.168946 | 77.488213 | 4.708658 | 1 |
| 3275 | 7.874671 | 195.102299 | 17404.177061 | 7.509306 | NaN | 327.459760 | 16.140368 | 78.698446 | 2.309149 | 1 |
3276 rows × 10 columns
df.shape
(3276, 10)
df.columns
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
dtype='object')
columns =list(df.columns)
columns
['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability']
# look for missing values
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3276 entries, 0 to 3275 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ph 2785 non-null float64 1 Hardness 3276 non-null float64 2 Solids 3276 non-null float64 3 Chloramines 3276 non-null float64 4 Sulfate 2495 non-null float64 5 Conductivity 3276 non-null float64 6 Organic_carbon 3276 non-null float64 7 Trihalomethanes 3114 non-null float64 8 Turbidity 3276 non-null float64 9 Potability 3276 non-null int64 dtypes: float64(9), int64(1) memory usage: 256.1 KB
# find which column has missing values
print(df.isnull().sum().sort_values(ascending=False).to_string())
Sulfate 781 ph 491 Trihalomethanes 162 Hardness 0 Solids 0 Chloramines 0 Conductivity 0 Organic_carbon 0 Turbidity 0 Potability 0
# Impute missing values for 'Sulfate', 'ph' and 'Trihalomethanes' with the column MEAN
# (the original comment referenced "Bare Nuclei", a column from a different dataset)
missing_col = ['Sulfate','ph','Trihalomethanes']
for i in missing_col:
    # fillna with the column mean is equivalent to the boolean-mask assignment,
    # but states the intent directly
    df[i] = df[i].fillna(df[i].mean())
# check if imputation was success by seeing if we still have missing values:
# find which column has missing values
print(df.isnull().sum().sort_values(ascending=False).to_string())
ph 0 Hardness 0 Solids 0 Chloramines 0 Sulfate 0 Conductivity 0 Organic_carbon 0 Trihalomethanes 0 Turbidity 0 Potability 0
# general statistics for our columns/features
df.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ph | 3276.0 | 7.080795 | 1.469956 | 0.000000 | 6.277673 | 7.080795 | 7.870050 | 14.000000 |
| Hardness | 3276.0 | 196.369496 | 32.879761 | 47.432000 | 176.850538 | 196.967627 | 216.667456 | 323.124000 |
| Solids | 3276.0 | 22014.092526 | 8768.570828 | 320.942611 | 15666.690297 | 20927.833607 | 27332.762127 | 61227.196008 |
| Chloramines | 3276.0 | 7.122277 | 1.583085 | 0.352000 | 6.127421 | 7.130299 | 8.114887 | 13.127000 |
| Sulfate | 3276.0 | 333.775777 | 36.142612 | 129.000000 | 317.094638 | 333.775777 | 350.385756 | 481.030642 |
| Conductivity | 3276.0 | 426.205111 | 80.824064 | 181.483754 | 365.734414 | 421.884968 | 481.792304 | 753.342620 |
| Organic_carbon | 3276.0 | 14.284970 | 3.308162 | 2.200000 | 12.065801 | 14.218338 | 16.557652 | 28.300000 |
| Trihalomethanes | 3276.0 | 66.396293 | 15.769881 | 0.738000 | 56.647656 | 66.396293 | 76.666609 | 124.000000 |
| Turbidity | 3276.0 | 3.966786 | 0.780382 | 1.450000 | 3.439711 | 3.955028 | 4.500320 | 6.739000 |
| Potability | 3276.0 | 0.390110 | 0.487849 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
# histogram with specified number of bins
df.hist(bins=20, figsize=(15, 15))
array([[<AxesSubplot:title={'center':'ph'}>,
<AxesSubplot:title={'center':'Hardness'}>,
<AxesSubplot:title={'center':'Solids'}>],
[<AxesSubplot:title={'center':'Chloramines'}>,
<AxesSubplot:title={'center':'Sulfate'}>,
<AxesSubplot:title={'center':'Conductivity'}>],
[<AxesSubplot:title={'center':'Organic_carbon'}>,
<AxesSubplot:title={'center':'Trihalomethanes'}>,
<AxesSubplot:title={'center':'Turbidity'}>],
[<AxesSubplot:title={'center':'Potability'}>, <AxesSubplot:>,
<AxesSubplot:>]], dtype=object)
# Partition the columns: fewer than 5 distinct values -> treated as categorical,
# otherwise numerical. (Only 'Potability' ends up categorical here.)
cat_list = [c for c in df.columns if len(df[c].unique()) < 5]
num_list = [c for c in df.columns if len(df[c].unique()) >= 5]
# Plot the distribution of every numerical variable on a 3x3 subplot grid.
plt.figure(figsize=(15,15))
plt.suptitle("Distribution of numerical variables")
for k, col in enumerate(num_list, start=1):
    plt.subplot(3,3,k)
    # sns.distplot was deprecated in seaborn 0.11 and later removed;
    # histplot(..., kde=True, stat="density") is the like-for-like replacement
    sns.histplot(df[col], kde=True, stat="density")
    plt.title(col)
# Overlay the per-class (potable vs non-potable) density of each of the
# nine feature columns, one subplot per feature.
non_potable = df.query("Potability ==0")
potable = df.query("Potability==1")
plt.figure(figsize=(20, 20))
for idx, feature in enumerate(df.columns[:9]):
    plt.subplot(3, 3, idx + 1)
    plt.title(feature)
    sns.kdeplot(x=non_potable[feature], label="Non Potable")
    sns.kdeplot(x=potable[feature], label="Potable")
    plt.legend()
plt.tight_layout()
# distribution for "Class"
op_count = df['Potability'].value_counts()
plt.figure(figsize=(10,5))
sns.barplot(x=op_count.index,y= op_count.values)
plt.title('Potability')
plt.ylabel('occurances by potability value', fontsize=12)
plt.xlabel('potability', fontsize=12)
plt.xticks(ticks=[0, 1], labels = ["Not Potable [0]", "Potable [1]"])
plt.show()
import plotly.express as px
d = pd.DataFrame(df["Potability"].value_counts())
fig = px.pie(d, values = "Potability", names = ["Not Potable", "Potable"], hole = 0.35, opacity = 0.8,
labels = {"label" :"Potability","Potability":"Number of Samples"})
fig.update_layout(title = dict(text = "Pie Chart of Potability Feature"))
fig.update_traces(textposition = "outside", textinfo = "percent+label")
fig.show()
# generate boxplot for individual features
df.boxplot(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
'Organic_carbon', 'Trihalomethanes', 'Turbidity'], figsize=(15, 10))
<AxesSubplot:>
# Examine outliers for each variable in more detail:
#pH column
fig, axes = plt.subplots(1,2)
sns.histplot(x = df['ph'], kde = True, ax = axes[0])
#define figure size
sns.set(rc={"figure.figsize":(15, 5)}) #width=6, height=5
sns.boxplot(x = df['ph'], ax = axes[1])
<AxesSubplot:xlabel='ph'>
# One figure per column: histogram (left) and boxplot (right).
sns.set_style('dark')
for col in df.columns:
    fig, axes = plt.subplots(1, 2)
    sns.histplot(x=df[col], kde=True, ax=axes[0])
    # pass ax explicitly -- the original relied on the "current axes" to land
    # the boxplot in axes[1]; also dropped the unused counter `i` and the
    # stray empty plt.figure() that produced a blank figure
    sns.boxplot(x=df[col], ax=axes[1])
<Figure size 720x720 with 0 Axes>
# CLEAN OUTLIERS
# Replace values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] with the column mean.
# (For the binary 'Potability' column the fences are [-1.5, 2.5], so it is
# untouched.)
for col in df.columns:
    q75, q25 = np.percentile(df.loc[:, col], [75, 25])
    intr_qr = q75 - q25
    # named upper/lower so the builtins max()/min() are not shadowed
    upper = q75 + (1.5 * intr_qr)
    lower = q25 - (1.5 * intr_qr)
    df.loc[df[col] < lower, col] = df[col].mean()
    df.loc[df[col] > upper, col] = df[col].mean()
# confirm the replacement introduced no NaNs (printed once, not per column)
print(df.isnull().sum().sort_values(ascending=False).to_string())
ph 0 Hardness 0 Solids 0 Chloramines 0 Sulfate 0 Conductivity 0 Organic_carbon 0 Trihalomethanes 0 Turbidity 0 Potability 0
# Re-check each column's histogram + boxplot after outlier removal.
sns.set_style('dark')
for col in df.columns:
    fig, axes = plt.subplots(1, 2)
    sns.histplot(x=df[col], kde=True, ax=axes[0])
    # explicit target axes (the original depended on the implicit current axes
    # and carried an unused counter and a stray empty plt.figure())
    sns.boxplot(x=df[col], ax=axes[1])
<Figure size 1296x1728 with 0 Axes>
# general statistics for our columns/features after imputation and outlier removal:
df.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ph | 3276.0 | 7.074315 | 1.212530 | 3.902476 | 6.376857 | 7.080795 | 7.762369 | 10.252816 |
| Hardness | 3276.0 | 196.518697 | 29.401877 | 117.791230 | 178.222090 | 196.967627 | 215.593162 | 275.886513 |
| Solids | 3276.0 | 21629.750926 | 8137.573905 | 320.942611 | 15666.690297 | 20927.833607 | 26957.576932 | 44652.363872 |
| Chloramines | 3276.0 | 7.123431 | 1.445700 | 3.181183 | 6.179765 | 7.130299 | 8.076082 | 11.086526 |
| Sulfate | 3276.0 | 333.813477 | 25.541204 | 267.202392 | 321.856816 | 333.775777 | 343.827461 | 400.274579 |
| Conductivity | 3276.0 | 425.500720 | 79.449495 | 201.619737 | 365.811312 | 421.926811 | 480.855683 | 652.537592 |
| Organic_carbon | 3276.0 | 14.286139 | 3.193340 | 5.362371 | 12.094010 | 14.246387 | 16.517104 | 23.234326 |
| Trihalomethanes | 3276.0 | 66.468165 | 14.609623 | 27.095703 | 57.201524 | 66.396293 | 76.336831 | 106.371720 |
| Turbidity | 3276.0 | 3.967260 | 0.759414 | 1.872573 | 3.444882 | 3.959577 | 4.494481 | 6.083772 |
| Potability | 3276.0 | 0.390110 | 0.487849 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
import seaborn as sns
#plt.figure(figsize=(7,5))
sns.heatmap(df.corr()[['Potability']], annot=True)
plt.show()
# correlation matrix and heatmap for feature selection
corr=df.corr()
plt.figure(figsize=(10, 6))
heatmap = sns.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap="coolwarm")
print("Most positively correlated features with the target variable: Potability")
# sort the matrix rows (in place) by their correlation with the target,
# then show just the Potability column of the sorted matrix
corr.sort_values(['Potability'], ascending=False, inplace=True)
corr.Potability
Most positively correlated features with the target variable: Potability
Potability 1.000000 Solids 0.025442 Chloramines 0.021136 Trihalomethanes 0.006346 Turbidity 0.004444 Sulfate 0.002171 ph -0.004099 Conductivity -0.008891 Hardness -0.012713 Organic_carbon -0.027319 Name: Potability, dtype: float64
# Only focus on features with correlation factor above 0.5
#top_feature = corr.index[corr['Potability']>0.2]
#plt.subplots(figsize=(10, 5))
#top_corr = df[top_feature].corr()
#sns.heatmap(top_corr, annot=True)
# Pairplot
sns.pairplot(df, hue = 'Potability')
<seaborn.axisgrid.PairGrid at 0x1510862ffd0>
X=df.drop("Potability",axis=1).values
y=df["Potability"].values
print(X)
print(y)
[[7.08079450e+00 2.04890455e+02 2.07913190e+04 ... 1.03797831e+01 8.69909705e+01 2.96313538e+00] [7.08079450e+00 1.29422921e+02 1.86300579e+04 ... 1.51800131e+01 5.63290763e+01 4.50065627e+00] [8.09912419e+00 2.24236259e+02 1.99095417e+04 ... 1.68686369e+01 6.64200925e+01 3.05593375e+00] ... [9.41951032e+00 1.75762646e+02 3.31555782e+04 ... 1.10390697e+01 6.98454003e+01 3.29887550e+00] [5.12676292e+00 2.30603758e+02 1.19838694e+04 ... 1.11689462e+01 7.74882131e+01 4.70865847e+00] [7.87467136e+00 1.95102299e+02 1.74041771e+04 ... 1.61403676e+01 7.86984463e+01 2.30914906e+00]] [0. 0. 0. ... 1. 1. 1.]
print(X[:,1])
[204.89045547 129.42292051 224.23625939 ... 175.7626463 230.60375751 195.10229859]
# train test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 3)
print("X_train",X_train.shape)
print("X_test",X_test.shape)
print("y_train",y_train.shape)
print("y_test",y_test.shape)
# plot one feature: column index 1 is 'Hardness' (NOT 'ph' -- 'ph' is column 0;
# the printed values above/below match the Hardness range ~117-276)
sns.histplot(x = X_train[:,1], kde = True)
X_train (2620, 9) X_test (656, 9) y_train (2620,) y_test (656,)
<AxesSubplot:ylabel='Count'>
print(X_train)
[[5.79115437e+00 1.90431679e+02 2.02882356e+04 ... 1.77068383e+01 8.43881914e+01 4.10483482e+00] [7.08079450e+00 1.87043283e+02 2.46345524e+04 ... 1.10539246e+01 7.41081803e+01 3.49871682e+00] [8.39724810e+00 1.99495811e+02 1.67722262e+04 ... 1.72360472e+01 7.58535486e+01 2.83579428e+00] ... [7.38010483e+00 2.37922637e+02 1.68625435e+04 ... 1.32544754e+01 9.28645571e+01 5.03045220e+00] [8.03105522e+00 2.22628424e+02 1.81348350e+04 ... 1.20659628e+01 7.85497122e+01 3.33888013e+00] [6.20357285e+00 1.39129083e+02 6.69823910e+03 ... 1.33681645e+01 6.82986891e+01 4.30554936e+00]]
# min-max normalization, computed PER FEATURE (axis=0).
# The original used np.max(X_train)/np.min(X_train), i.e. the global max/min
# of the whole matrix -- that scales every column by the 'Solids' range and
# squashes the other features to near zero (visible in the printed X_train).
x_train_max = np.max(X_train, axis=0)
x_train_min = np.min(X_train, axis=0)
X_train = (X_train - x_train_min)/(x_train_max-x_train_min)
# scale the test set with the TRAIN statistics to avoid data leakage
X_test = (X_test - x_train_min)/(x_train_max-x_train_min)
print("X_train",X_train.shape)
print("X_test",X_test.shape)
print("y_train",y_train.shape)
print("y_test",y_test.shape)
# plot one feature: column index 1 is 'Hardness'
sns.histplot(x = X_train[:,1], kde = True)
X_train (2620, 9) X_test (656, 9) y_train (2620,) y_test (656,)
<AxesSubplot:ylabel='Count'>
print(X_train)
[[8.78391522e-05 4.22675167e-03 4.54740268e-01 ... 3.54941801e-04 1.84967481e-03 5.00385170e-05] [1.16747799e-04 4.15079720e-03 5.52167555e-01 ... 2.05809709e-04 1.61923750e-03 3.64517414e-05] [1.46257498e-04 4.42993377e-03 3.75925203e-01 ... 3.44388521e-04 1.65836177e-03 2.15916321e-05] ... [1.23457156e-04 5.29131169e-03 3.77949759e-01 ... 2.55137381e-04 2.03968151e-03 7.07872090e-05] [1.38048897e-04 4.94847575e-03 4.06469519e-01 ... 2.28495615e-04 1.71879913e-03 3.28688330e-05] [9.70839482e-05 3.07674976e-03 1.50106133e-01 ... 2.57685844e-04 1.48901161e-03 5.45377457e-05]]
# Fit a suite of baseline classifiers and record each one's test-set accuracy.
models = [("LR", LogisticRegression()),
          ("SVC", SVC()),
          ("gbm", GradientBoostingClassifier()),
          ('KNN', KNeighborsClassifier()),
          ("DTC", DecisionTreeClassifier()),
          ("BNB", BernoulliNB()),
          ('RFC', RandomForestClassifier())]
results = []
names = []
finalResults = []
for clf_name, clf in models:
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    acc = round(accuracy_score(y_test, predictions) * 100, 2)
    results.append(acc)
    names.append(clf_name)
    finalResults.append((clf_name, "%s%%" % acc))
finalResults
[('LR', '60.52%'),
('SVC', '60.52%'),
('gbm', '61.59%'),
('KNN', '51.98%'),
('DTC', '55.79%'),
('BNB', '60.52%'),
('RFC', '64.18%')]
# Baseline random forest. NOTE(review): max_depth=200 is far deeper than
# 2620 training samples can ever require -- effectively unconstrained depth,
# which matches the near-perfect train accuracy / low test accuracy below.
model = RandomForestClassifier (max_depth=200, random_state=0, n_estimators=10)
# train model with data
model.fit(X_train,y_train.ravel())
RandomForestClassifier(max_depth=200, n_estimators=10, random_state=0)
# make predictions
y_pred = model.predict(X_test)
df2 = pd.DataFrame(y_pred, columns = ['y-pred'])
vertical_concat = pd.concat([df2, pd.DataFrame(y_test,columns=['label'])], axis = 1 )
vertical_concat.head(10)
| y-pred | label | |
|---|---|---|
| 0 | 0.0 | 1.0 |
| 1 | 0.0 | 1.0 |
| 2 | 0.0 | 1.0 |
| 3 | 0.0 | 0.0 |
| 4 | 0.0 | 0.0 |
| 5 | 0.0 | 0.0 |
| 6 | 0.0 | 1.0 |
| 7 | 0.0 | 0.0 |
| 8 | 0.0 | 1.0 |
| 9 | 0.0 | 0.0 |
# confusion matrix: rows = true class (0/1), columns = predicted class
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)
[[332 65] [209 50]]
# accuracy
score = model.score(X_test, y_test)
print(score*100,'%')
58.231707317073166 %
print (f'Train Accuracy - : {model.score(X_train,y_train)*100:.3f} %')
print (f'Test Accuracy - : {model.score(X_test,y_test)*100:.3f} %')
Train Accuracy - : 97.710 % Test Accuracy - : 58.232 %
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
# NOTE(review): MSE/MAE/R^2 are regression metrics; on 0/1 labels MSE and MAE
# both equal the misclassification rate, so they add nothing beyond accuracy.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2s= r2_score(y_test, y_pred)
print(f' mse = {mse}, mae = {mae}, r2s={r2s}')
mse = 0.4176829268292683, mae = 0.4176829268292683, r2s=-0.7480913803331939
from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [2,4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the param grid
param_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
print(param_grid)
{'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}
rf_Model = RandomForestClassifier()
rf_Grid = GridSearchCV(estimator = rf_Model, param_grid = param_grid, cv = 10, verbose=2, n_jobs = 4)
rf_Grid.fit(X_train, y_train)
Fitting 10 folds for each of 320 candidates, totalling 3200 fits
GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=4,
param_grid={'bootstrap': [True, False], 'max_depth': [2, 4],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2],
'min_samples_split': [2, 5],
'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72,
80]},
verbose=2)
rf_Grid.best_params_
{'bootstrap': False,
'max_depth': 4,
'max_features': 'auto',
'min_samples_leaf': 2,
'min_samples_split': 2,
'n_estimators': 17}
# Put the best parameter values found by GridSearchCV into the model and re-run.
# These mirror rf_Grid.best_params_ printed above; the original cell hard-coded
# different values (bootstrap=True, max_depth=2, min_samples_leaf=1), which
# silently discarded the grid-search result.
model = RandomForestClassifier (bootstrap = False,
                                max_depth = 4,
                                max_features = 'auto',
                                min_samples_leaf = 2,
                                min_samples_split = 2,
                                n_estimators = 17)
# train model with data
model.fit(X_train,y_train.ravel())
RandomForestClassifier(max_depth=2, n_estimators=10)
# make predictions
y_pred = model.predict(X_test)
df2 = pd.DataFrame(y_pred, columns = ['y-pred'])
vertical_concat = pd.concat([df2, pd.DataFrame(y_test,columns=['label'])], axis = 1 )
vertical_concat.head(10)
| y-pred | label | |
|---|---|---|
| 0 | 0.0 | 1.0 |
| 1 | 0.0 | 1.0 |
| 2 | 0.0 | 1.0 |
| 3 | 0.0 | 0.0 |
| 4 | 0.0 | 0.0 |
| 5 | 0.0 | 0.0 |
| 6 | 0.0 | 1.0 |
| 7 | 0.0 | 0.0 |
| 8 | 0.0 | 1.0 |
| 9 | 0.0 | 0.0 |
# confusion matrix: rows = true class (0/1), columns = predicted class
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)
[[394 3] [252 7]]
# accuracy
score = model.score(X_test, y_test)
print(score*100,'%')
61.12804878048781 %
print (f'Train Accuracy - : {model.score(X_train,y_train)*100:.3f} %')
print (f'Test Accuracy - : {model.score(X_test,y_test)*100:.3f} %')
Train Accuracy - : 61.489 % Test Accuracy - : 61.128 %
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2s= r2_score(y_test, y_pred)
print(f' mse = {mse}, mae = {mae}, r2s={r2s}')
mse = 0.38871951219512196, mae = 0.38871951219512196, r2s=-0.6268733649086293
from sklearn.model_selection import RandomizedSearchCV
rf_RandomGrid = RandomizedSearchCV(estimator = rf_Model, param_distributions = param_grid, cv = 10, verbose=2, n_jobs = 4)
rf_RandomGrid.fit(X_train, y_train)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=4,
param_distributions={'bootstrap': [True, False],
'max_depth': [2, 4],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2],
'min_samples_split': [2, 5],
'n_estimators': [10, 17, 25, 33, 41, 48,
56, 64, 72, 80]},
verbose=2)
rf_RandomGrid.best_params_
{'n_estimators': 56,
'min_samples_split': 2,
'min_samples_leaf': 2,
'max_features': 'sqrt',
'max_depth': 4,
'bootstrap': True}
# Put the best parameter values found by RandomizedSearchCV into the model and
# re-run. These mirror rf_RandomGrid.best_params_ printed above; the original
# cell hard-coded n_estimators=10 instead of the reported 56.
model = RandomForestClassifier (n_estimators = 56,
                                min_samples_split = 2,
                                min_samples_leaf = 2,
                                max_features = 'sqrt',
                                max_depth = 4,
                                bootstrap = True)
# train model with data
model.fit(X_train,y_train.ravel())
RandomForestClassifier(max_depth=4, max_features='sqrt', min_samples_leaf=2,
n_estimators=10)
# make predictions
y_pred = model.predict(X_test)
df2 = pd.DataFrame(y_pred, columns = ['y-pred'])
vertical_concat = pd.concat([df2, pd.DataFrame(y_test,columns=['label'])], axis = 1 )
vertical_concat.head(10)
| y-pred | label | |
|---|---|---|
| 0 | 0.0 | 1.0 |
| 1 | 0.0 | 1.0 |
| 2 | 0.0 | 1.0 |
| 3 | 0.0 | 0.0 |
| 4 | 0.0 | 0.0 |
| 5 | 0.0 | 0.0 |
| 6 | 0.0 | 1.0 |
| 7 | 0.0 | 0.0 |
| 8 | 0.0 | 1.0 |
| 9 | 0.0 | 0.0 |
# confusion matrix: rows = true class (0/1), columns = predicted class
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)
[[389 8] [243 16]]
# accuracy
score = model.score(X_test, y_test)
print(score*100,'%')
61.737804878048784 %
print (f'Train Accuracy - : {model.score(X_train,y_train)*100:.3f} %')
print (f'Test Accuracy - : {model.score(X_test,y_test)*100:.3f} %')
Train Accuracy - : 64.275 % Test Accuracy - : 61.738 %
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2s= r2_score(y_test, y_pred)
print(f' mse = {mse}, mae = {mae}, r2s={r2s}')
mse = 0.3826219512195122, mae = 0.3826219512195122, r2s=-0.6013537827139841
#import pickle
# #Open a file and use dump() to serialize the trained model
#with open('water_quality.pkl', 'wb') as file:
#    # A new file will be created
#    pickle.dump(model, file)